import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import datasets
# Quick sanity-load of the dataset as raw (X, y) NumPy arrays.
# NOTE(review): `features` and `tar` are never used again — the DataFrame
# built below from load_diabetes() supersedes this; consider removing.
features, tar = datasets.load_diabetes(return_X_y=True)
from sklearn import datasets
import pandas as pd
# Assemble the scikit-learn diabetes dataset into one DataFrame:
# ten standardized feature columns plus the 'target' column
# (a quantitative measure of disease progression one year after baseline).
diabetes = datasets.load_diabetes()
diabetes_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)
diabetes_df.head()
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 |
| 1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 |
| 2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 |
| 3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 |
| 4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |
# Replace the opaque s1–s6 serum-measurement codes with descriptive names.
serum_column_names = {
    's1': 'total serum cholesterol',
    's2': 'low-density lipoproteins',
    's3': 'high-density lipoproteins',
    's4': 'total cholesterol / HDL',
    's5': 'possibly log of serum triglycerides level',
    's6': 'blood sugar level',
}
diabetes_df = diabetes_df.rename(columns=serum_column_names)
diabetes_df.head()
| age | sex | bmi | bp | total serum cholesterol | low-density lipoproteins | high-density lipoproteins | total cholesterol / HDL | possibly log of serum triglycerides level | blood sugar level | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 |
| 1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 |
| 2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 |
| 3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 |
| 4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |
# (rows, columns) — expect (442, 11): 442 samples, 10 features + target.
diabetes_df.shape
(442, 11)
# Dtypes and non-null counts — all 11 columns are float64 with no missing values.
diabetes_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 442 entries, 0 to 441 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 442 non-null float64 1 sex 442 non-null float64 2 bmi 442 non-null float64 3 bp 442 non-null float64 4 total serum cholesterol 442 non-null float64 5 low-density lipoproteins 442 non-null float64 6 high-density lipoproteins 442 non-null float64 7 total cholesterol / HDL 442 non-null float64 8 possibly log of serum triglycerides level 442 non-null float64 9 blood sugar level 442 non-null float64 10 target 442 non-null float64 dtypes: float64(11) memory usage: 38.1 KB
# Inspect the target distribution: its distinct values and their frequencies
# (many distinct values over 442 rows — a continuous regression target,
# not a classification label).
tgt = diabetes_df['target']
print(tgt.unique())
print(tgt.value_counts())
[151. 75. 141. 206. 135. 97. 138. 63. 110. 310. 101. 69. 179. 185.
118. 171. 166. 144. 168. 68. 49. 245. 184. 202. 137. 85. 131. 283.
129. 59. 341. 87. 65. 102. 265. 276. 252. 90. 100. 55. 61. 92.
259. 53. 190. 142. 155. 225. 104. 182. 128. 52. 37. 170. 71. 163.
150. 160. 178. 48. 270. 111. 42. 200. 113. 143. 51. 210. 134. 98.
164. 96. 162. 279. 83. 302. 198. 95. 232. 81. 246. 297. 258. 229.
275. 281. 173. 180. 84. 121. 161. 99. 109. 115. 268. 274. 158. 107.
103. 272. 280. 336. 317. 235. 60. 174. 126. 288. 88. 292. 197. 186.
25. 195. 217. 172. 214. 70. 220. 152. 47. 74. 295. 127. 237. 64.
79. 91. 116. 86. 122. 72. 39. 196. 222. 277. 77. 191. 73. 263.
248. 296. 78. 93. 208. 108. 154. 124. 67. 257. 262. 177. 187. 125.
215. 303. 243. 153. 346. 89. 50. 308. 145. 45. 264. 241. 66. 94.
230. 181. 156. 233. 219. 80. 332. 31. 236. 253. 44. 114. 147. 242.
249. 192. 244. 199. 306. 216. 139. 148. 54. 221. 311. 321. 58. 123.
167. 140. 40. 132. 201. 273. 43. 175. 293. 189. 209. 136. 261. 146.
212. 120. 183. 57.]
target
200.0 6
72.0 6
90.0 5
178.0 5
71.0 5
..
73.0 1
222.0 1
86.0 1
79.0 1
57.0 1
Name: count, Length: 214, dtype: int64
# Hexbin joint plot of blood pressure against BMI, with marginal histograms.
sns.jointplot(data=diabetes_df, x='bp', y='bmi', kind='hex')
<seaborn.axisgrid.JointGrid at 0x2067eb0fbd0>
# Joint plot with a fitted regression line: blood sugar level vs. age.
sns.jointplot(data=diabetes_df, x='blood sugar level', y='age', kind='reg')
<seaborn.axisgrid.JointGrid at 0x2067ea9edd0>
# Pairwise scatter matrix of all columns, colored by the (standardized) sex feature.
sns.pairplot(diabetes_df, hue='sex')
<seaborn.axisgrid.PairGrid at 0x2067eb0b650>
# --- Univariate linear regression: BMI vs. disease progression ---

# Reload features/target as NumPy arrays and rebuild the DataFrame with the
# original column names (the earlier rename would otherwise hide 'bmi' etc.).
diabetes_x, diabetes_y = datasets.load_diabetes(return_X_y=True, as_frame=False)
diabetes_df = pd.DataFrame(diabetes_x, columns=datasets.load_diabetes().feature_names)
diabetes_df['target'] = diabetes_y

# Keep only the BMI feature (column index 2), reshaped to (n_samples, 1):
# scikit-learn estimators require a 2-D feature matrix.
diabetes_X = diabetes_x[:, np.newaxis, 2]
diabetes_y = diabetes_df["target"]

# Hold out 20% of the rows for testing; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes_X, diabetes_y, test_size=0.2, random_state=0
)

# Fit ordinary least squares on the training split and predict the test split.
diabetes_model = LinearRegression()
diabetes_model.fit(X_train, y_train)
y_pred = diabetes_model.predict(X_test)

# Visualize the fit: train/test scatter plus the fitted regression line.
plt.figure(figsize=(8, 6))
plt.scatter(X_test, y_test, color='green', label='Test Data')
plt.scatter(X_train, y_train, color='blue', label='Trained Data')
plt.plot(X_test, y_pred, color='red', linewidth=4, label='Predicted')
plt.xlabel('Body Mass Index')
plt.ylabel('Disease Progression')
plt.title('Univariate : Linear Regression: Body Mass Index vs. Disease Progression')
plt.legend()
plt.show()

print("Coefficients : %.2f" % diabetes_model.coef_[0])
print("Intercept: %.2f" % diabetes_model.intercept_, "\n")

# Evaluate on both splits. Compute the training-set predictions once and
# reuse them (the original called predict(X_train) twice).
y_pred_train = diabetes_model.predict(X_train)
train_MAE = metrics.mean_absolute_error(y_train, y_pred_train)
test_MAE = metrics.mean_absolute_error(y_test, y_pred)
train_MSE = metrics.mean_squared_error(y_train, y_pred_train)
test_MSE = metrics.mean_squared_error(y_test, y_pred)

# BUG FIX: the original prints paired every value with the wrong label
# (MSE values were reported under "Mean Absolute Error" headings and the
# train/test values were swapped). The labels below match the metrics
# actually printed, in the Train/Test MAE-then-MSE order of the recorded output.
print(f"Mean Absolute Error (Train) : {train_MAE:.2f}")
print(f"Mean Absolute Error (Test) : {test_MAE:.2f}", "\n")
print(f"Mean Squared Error (Train) : {train_MSE:.2f}")
print(f"Mean Squared Error (Test) : {test_MSE:.2f}")
Coefficients : 981.66 Intercept: 152.29 Mean Absolute Error (Train) : 51.32 Mean Absolute Error (Test) : 52.94 Mean Squared Error (Train) : 3827.82 Mean Squared Error (Test) : 4150.68
We observe a high Mean Absolute Error (about 51 on the training split and 53 on the test split), indicating that the model is not performing well. A univariate linear regression on BMI alone is not sufficient for this dataset: the target values are highly dispersed around the fitted line.